In [1]:
# hail imports

from pprint import pprint
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.models import Span
from bokeh.plotting import figure, show, output_file
import pandas as pd
import os , sys, time
import numpy as np

import hail as hl
from bokeh.io import output_notebook, show
In [2]:
hl.init()
output_notebook()
Running on Apache Spark version 2.4.1
SparkUI available at http://cerbalabserver:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.31-6060f9c971cc
LOGGING: writing to /home/mrinalv/project/hail-20200525-0007-0.2.31-6060f9c971cc.log
Loading BokehJS ...
In [3]:
#####################################
# importing maually parsed vcf file
#####################################

geno = hl.import_vcf('/media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/manually_parsed_2_snp_annotated.vcf')

#geno = hl.read_matrix_table('/media/CERBADATA/vc_proj/ALL_COHORTS.jc.vcf.FILTERED.SELECTED.mt')
In [4]:
# for resetting the variable, if something goes wrong along the way
#%reset_selective geno
In [5]:
geno.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: float64, 
        QD: float64, 
        RAW_MQandDP: array<int32>, 
        ReadPosRankSum: float64, 
        SOR: float64, 
        CSQ: array<str>, 
        gnomAD_AF: float64, 
        gnomAD_AFR_AF: float64, 
        gnomAD_AMR_AF: float64, 
        gnomAD_EAS_AF: float64, 
        gnomAD_SAS_AF: float64, 
        gnomAD_NFE_AF: float64, 
        gnomAD_FIN_AF: float64, 
        worst_csq: str, 
        gene_name: str, 
        clinvar_significane: str, 
        nonref_AF: float64
    }
----------------------------------------
Entry fields:
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'GT': call
    'MIN_DP': int32
    'PGT': call
    'PID': str
    'PL': array<int32>
    'PS': int32
    'RGQ': int32
    'SB': array<int32>
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
In [6]:
# Sample QC
geno = hl.sample_qc(geno)
In [7]:
sn = hl.plot.histogram(geno.info.AN)
2020-05-25 00:10:38 Hail: INFO: Coerced sorted dataset
2020-05-25 00:15:11 Hail: INFO: Coerced sorted dataset
In [8]:
show(sn)
In [9]:
# qc before filtering

# sample_qc plots

n_hom_var = hl.plot.histogram(geno.sample_qc.n_hom_var)
n_singleton = hl.plot.histogram(geno.sample_qc.n_singleton)
n_snp = hl.plot.histogram(geno.sample_qc.n_snp)
n_transition = hl.plot.histogram(geno.sample_qc.n_transition)
n_transversion = hl.plot.histogram(geno.sample_qc.n_transversion)
n_star = hl.plot.histogram(geno.sample_qc.n_star)
r_ins_del = hl.plot.histogram(geno.sample_qc.r_insertion_deletion)
n_delet = hl.plot.histogram(geno.sample_qc.n_deletion)
n_insert = hl.plot.histogram(geno.sample_qc.n_insertion)
sample_call_rate = hl.plot.histogram(geno.sample_qc.call_rate)
titv = hl.plot.histogram(geno.sample_qc.r_ti_tv)
hethom = hl.plot.histogram(geno.sample_qc.r_het_hom_var)
nonrefs = hl.plot.histogram(geno.sample_qc.n_non_ref)

show(nonrefs)
show(hethom)
show(titv)
show(sample_call_rate)
show(n_insert)
show(n_delet)
show(r_ins_del)
show(n_hom_var)
show(n_singleton)
show(n_snp)
show(n_transition)
show(n_transversion)
show(n_star)
2020-05-25 00:15:58 Hail: INFO: Coerced sorted dataset
2020-05-25 00:16:50 Hail: INFO: Coerced sorted dataset
2020-05-25 00:17:07 Hail: INFO: Coerced sorted dataset
2020-05-25 00:18:01 Hail: INFO: Coerced sorted dataset
2020-05-25 00:18:18 Hail: INFO: Coerced sorted dataset
2020-05-25 00:19:13 Hail: INFO: Coerced sorted dataset
2020-05-25 00:19:30 Hail: INFO: Coerced sorted dataset
2020-05-25 00:20:24 Hail: INFO: Coerced sorted dataset
2020-05-25 00:20:40 Hail: INFO: Coerced sorted dataset
2020-05-25 00:21:43 Hail: INFO: Coerced sorted dataset
2020-05-25 00:21:59 Hail: INFO: Coerced sorted dataset
2020-05-25 00:23:00 Hail: INFO: Coerced sorted dataset
2020-05-25 00:23:17 Hail: INFO: Coerced sorted dataset
2020-05-25 00:24:17 Hail: INFO: Coerced sorted dataset
2020-05-25 00:24:34 Hail: INFO: Coerced sorted dataset
2020-05-25 00:25:35 Hail: INFO: Coerced sorted dataset
2020-05-25 00:25:51 Hail: INFO: Coerced sorted dataset
2020-05-25 00:26:52 Hail: INFO: Coerced sorted dataset
2020-05-25 00:27:09 Hail: INFO: Coerced sorted dataset
2020-05-25 00:28:09 Hail: INFO: Coerced sorted dataset
2020-05-25 00:28:26 Hail: INFO: Coerced sorted dataset
2020-05-25 00:29:26 Hail: INFO: Coerced sorted dataset
2020-05-25 00:29:43 Hail: INFO: Coerced sorted dataset
2020-05-25 00:30:43 Hail: INFO: Coerced sorted dataset
2020-05-25 00:31:00 Hail: INFO: Coerced sorted dataset
2020-05-25 00:32:00 Hail: INFO: Coerced sorted dataset
2020-05-25 00:32:17 Hail: INFO: Coerced sorted dataset
2020-05-25 00:33:17 Hail: INFO: Coerced sorted dataset
2020-05-25 00:33:33 Hail: INFO: Coerced sorted dataset
2020-05-25 00:34:32 Hail: INFO: Coerced sorted dataset
2020-05-25 00:34:47 Hail: INFO: Coerced sorted dataset
2020-05-25 00:35:46 Hail: INFO: Coerced sorted dataset
2020-05-25 00:36:02 Hail: INFO: Coerced sorted dataset
2020-05-25 00:37:01 Hail: INFO: Coerced sorted dataset
2020-05-25 00:37:16 Hail: INFO: Coerced sorted dataset
2020-05-25 00:38:15 Hail: INFO: Coerced sorted dataset
2020-05-25 00:38:30 Hail: INFO: Coerced sorted dataset
2020-05-25 00:39:19 Hail: INFO: Coerced sorted dataset
2020-05-25 00:39:33 Hail: INFO: Coerced sorted dataset
2020-05-25 00:40:03 Hail: INFO: Coerced sorted dataset
2020-05-25 00:40:50 Hail: INFO: Coerced sorted dataset
2020-05-25 00:41:05 Hail: INFO: Coerced sorted dataset
2020-05-25 00:41:33 Hail: INFO: Coerced sorted dataset
2020-05-25 00:42:32 Hail: INFO: Coerced sorted dataset
2020-05-25 00:42:46 Hail: INFO: Coerced sorted dataset
2020-05-25 00:43:45 Hail: INFO: Coerced sorted dataset
2020-05-25 00:43:59 Hail: INFO: Coerced sorted dataset
2020-05-25 00:44:48 Hail: INFO: Coerced sorted dataset
2020-05-25 00:45:02 Hail: INFO: Coerced sorted dataset
2020-05-25 00:45:50 Hail: INFO: Coerced sorted dataset
2020-05-25 00:46:05 Hail: INFO: Coerced sorted dataset
2020-05-25 00:46:53 Hail: INFO: Coerced sorted dataset
2020-05-25 00:47:08 Hail: INFO: Coerced sorted dataset
2020-05-25 00:47:56 Hail: INFO: Coerced sorted dataset
In [10]:
# variant qc

geno = hl.variant_qc(geno)
In [11]:
# variant qc fields

call_rate = hl.plot.histogram(geno.variant_qc.call_rate)
n_called = hl.plot.histogram(geno.variant_qc.n_called)
n_not_called = hl.plot.histogram(geno.variant_qc.n_not_called)
n_het = hl.plot.histogram(geno.variant_qc.n_het)
n_non_ref = hl.plot.histogram(geno.variant_qc.n_non_ref)
het_freq_hwe = hl.plot.histogram(geno.variant_qc.het_freq_hwe)
p_value_hwe = hl.plot.histogram(geno.variant_qc.p_value_hwe)

show(call_rate)
show(n_called)
show(n_not_called)
show(n_het)
show(n_non_ref)
show(het_freq_hwe)
show(p_value_hwe)
2020-05-25 00:48:11 Hail: INFO: Coerced sorted dataset
2020-05-25 00:48:59 Hail: INFO: Coerced sorted dataset
2020-05-25 00:49:46 Hail: INFO: Coerced sorted dataset
2020-05-25 00:50:33 Hail: INFO: Coerced sorted dataset
2020-05-25 00:51:20 Hail: INFO: Coerced sorted dataset
2020-05-25 00:52:07 Hail: INFO: Coerced sorted dataset
2020-05-25 00:52:54 Hail: INFO: Coerced sorted dataset
2020-05-25 00:53:42 Hail: INFO: Coerced sorted dataset
2020-05-25 00:54:29 Hail: INFO: Coerced sorted dataset
2020-05-25 00:55:17 Hail: INFO: Coerced sorted dataset
2020-05-25 00:56:04 Hail: INFO: Coerced sorted dataset
2020-05-25 00:56:52 Hail: INFO: Coerced sorted dataset
2020-05-25 00:57:40 Hail: INFO: Coerced sorted dataset
2020-05-25 00:58:28 Hail: INFO: Coerced sorted dataset
In [12]:
##### filtering step
# mean sample_GQ >= 30
# sample r_het_hom_vars in [1,2]
# sample r_ti_tv > 2
# sample r_insertion_deletion in [0.5, 1.2]

#geno.filter_cols(geno.sample_qc.dp_stats.mean >= 4)

geno = geno.filter_cols(geno.sample_qc.r_het_hom_var > 1)
geno = geno.filter_cols(geno.sample_qc.r_het_hom_var < 2)
geno = geno.filter_cols(geno.sample_qc.r_ti_tv >2)
geno = geno.filter_cols(geno.sample_qc.r_insertion_deletion >0.5)
geno = geno.filter_cols(geno.sample_qc.r_insertion_deletion <1.2)

geno = geno.filter_cols(geno.sample_qc.gq_stats.mean >= 30)
In [13]:
# All qc plots\

# sample_qc
geno = hl.sample_qc(geno)

r_ins_del_a = hl.plot.histogram(geno.sample_qc.r_insertion_deletion)
titv_a = hl.plot.histogram(geno.sample_qc.r_ti_tv)
hethom_a = hl.plot.histogram(geno.sample_qc.r_het_hom_var)
qc_jgt_a = hl.plot.histogram(geno.sample_qc.gq_stats.mean, range=(10,70), bins=100, title='Mean Sample GQ Histogram for JGT', legend='Mean Sample GQ')

show(r_ins_del_a)
show(titv_a)
show(hethom_a)
show(qc_jgt_a)
2020-05-25 00:59:17 Hail: INFO: Coerced sorted dataset
2020-05-25 01:00:24 Hail: INFO: Coerced sorted dataset
2020-05-25 01:01:25 Hail: INFO: Coerced sorted dataset
2020-05-25 01:01:40 Hail: INFO: Coerced sorted dataset
2020-05-25 01:02:46 Hail: INFO: Coerced sorted dataset
2020-05-25 01:03:45 Hail: INFO: Coerced sorted dataset
2020-05-25 01:04:00 Hail: INFO: Coerced sorted dataset
2020-05-25 01:05:06 Hail: INFO: Coerced sorted dataset
2020-05-25 01:06:05 Hail: INFO: Coerced sorted dataset
2020-05-25 01:06:20 Hail: INFO: Coerced sorted dataset
2020-05-25 01:07:27 Hail: INFO: Coerced sorted dataset
2020-05-25 01:08:27 Hail: INFO: Coerced sorted dataset
2020-05-25 01:08:41 Hail: INFO: Coerced sorted dataset
2020-05-25 01:09:48 Hail: INFO: Coerced sorted dataset
2020-05-25 01:10:38 Hail: INFO: Coerced sorted dataset
2020-05-25 01:10:53 Hail: INFO: Coerced sorted dataset
2020-05-25 01:12:00 Hail: INFO: Coerced sorted dataset
2020-05-25 01:12:49 Hail: INFO: Coerced sorted dataset
2020-05-25 01:13:04 Hail: INFO: Coerced sorted dataset
2020-05-25 01:14:11 Hail: INFO: Coerced sorted dataset
2020-05-25 01:15:00 Hail: INFO: Coerced sorted dataset
In [14]:
# subsetting geno
geno_f_sub = geno.filter_rows(geno.variant_qc.call_rate >= 0.99)

geno_f_sub = geno_f_sub.filter_rows(geno_f_sub.variant_qc.n_non_ref >= 9.97)
In [15]:
#### Kinship analysis

pc_rel = hl.pc_relate(geno_f_sub.GT, 0.001, k=2, statistics='kin')
pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, keep=False)
geno = geno.filter_cols(hl.is_defined(related_samples_to_remove[geno.col_key]), keep=False)
2020-05-25 01:15:15 Hail: INFO: Coerced sorted dataset
2020-05-25 01:16:24 Hail: INFO: Coerced sorted dataset
2020-05-25 01:17:03 Hail: INFO: hwe_normalized_pca: running PCA using 1931 variants.
2020-05-25 01:17:18 Hail: INFO: Coerced sorted dataset
2020-05-25 01:18:27 Hail: INFO: Coerced sorted dataset
2020-05-25 01:19:06 Hail: INFO: pca: running PCA with 2 components...
2020-05-25 01:21:08 Hail: INFO: Coerced sorted dataset
2020-05-25 01:22:17 Hail: INFO: Coerced sorted dataset
2020-05-25 01:22:32 Hail: INFO: Coerced sorted dataset
2020-05-25 01:23:41 Hail: INFO: Coerced sorted dataset
2020-05-25 01:36:41 Hail: INFO: Wrote all 1 blocks of 1932 x 997 matrix with block size 4096.
2020-05-25 01:36:56 Hail: INFO: Coerced sorted dataset
2020-05-25 01:38:06 Hail: INFO: Coerced sorted dataset
2020-05-25 01:38:22 Hail: INFO: Coerced sorted dataset
2020-05-25 01:39:32 Hail: INFO: Coerced sorted dataset
2020-05-25 01:39:36 Hail: INFO: wrote matrix with 997 rows and 997 columns as 1 block of size 4096 to file:/tmp/hail.thYcsW4B6HfH/VHTLHzcLFC.bm
2020-05-25 01:39:37 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-05-25 01:39:40 Hail: INFO: wrote table with 304 rows in 1 partition to file:/tmp/hail.thYcsW4B6HfH/x8LKhkAd2K
2020-05-25 01:39:41 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
In [16]:
geno = hl.variant_qc(geno)
geno = geno.filter_rows(geno.variant_qc.n_non_ref > 0)
In [17]:
print('number of variants after quality filtering')
print(geno.count_rows())
print('number of samples remaining after quality filtering')
print(geno.count_cols())
number of variants after quality filtering
2020-05-25 01:39:57 Hail: INFO: Coerced sorted dataset
2020-05-25 01:41:06 Hail: INFO: Coerced sorted dataset
2020-05-25 01:42:15 Hail: INFO: Coerced sorted dataset
2020-05-25 01:42:15 Hail: INFO: Coerced sorted dataset
2020-05-25 01:42:15 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-05-25 01:42:30 Hail: INFO: Coerced sorted dataset
1263481
number of samples remaining after quality filtering
2020-05-25 01:43:25 Hail: INFO: Coerced sorted dataset
2020-05-25 01:44:34 Hail: INFO: Coerced sorted dataset
2020-05-25 01:45:44 Hail: INFO: Coerced sorted dataset
2020-05-25 01:45:58 Hail: INFO: Coerced sorted dataset
2020-05-25 01:45:58 Hail: INFO: Coerced sorted dataset
2020-05-25 01:45:58 Hail: INFO: Ordering unsorted dataset with network shuffle
840
In [18]:
# correlation plots
geno = geno.annotate_rows(REX_AF = 1 - geno.variant_qc.AF[0])
In [19]:
geno.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'sample_qc': struct {
        dp_stats: struct {
            mean: float64, 
            stdev: float64, 
            min: float64, 
            max: float64
        }, 
        gq_stats: struct {
            mean: float64, 
            stdev: float64, 
            min: float64, 
            max: float64
        }, 
        call_rate: float64, 
        n_called: int64, 
        n_not_called: int64, 
        n_filtered: int64, 
        n_hom_ref: int64, 
        n_het: int64, 
        n_hom_var: int64, 
        n_non_ref: int64, 
        n_singleton: int64, 
        n_snp: int64, 
        n_insertion: int64, 
        n_deletion: int64, 
        n_transition: int64, 
        n_transversion: int64, 
        n_star: int64, 
        r_ti_tv: float64, 
        r_het_hom_var: float64, 
        r_insertion_deletion: float64
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: float64, 
        QD: float64, 
        RAW_MQandDP: array<int32>, 
        ReadPosRankSum: float64, 
        SOR: float64, 
        CSQ: array<str>, 
        gnomAD_AF: float64, 
        gnomAD_AFR_AF: float64, 
        gnomAD_AMR_AF: float64, 
        gnomAD_EAS_AF: float64, 
        gnomAD_SAS_AF: float64, 
        gnomAD_NFE_AF: float64, 
        gnomAD_FIN_AF: float64, 
        worst_csq: str, 
        gene_name: str, 
        clinvar_significane: str, 
        nonref_AF: float64
    }
    'variant_qc': struct {
        dp_stats: struct {
            mean: float64, 
            stdev: float64, 
            min: float64, 
            max: float64
        }, 
        gq_stats: struct {
            mean: float64, 
            stdev: float64, 
            min: float64, 
            max: float64
        }, 
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        homozygote_count: array<int32>, 
        call_rate: float64, 
        n_called: int64, 
        n_not_called: int64, 
        n_filtered: int64, 
        n_het: int64, 
        n_non_ref: int64, 
        het_freq_hwe: float64, 
        p_value_hwe: float64
    }
    'REX_AF': float64
----------------------------------------
Entry fields:
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'GT': call
    'MIN_DP': int32
    'PGT': call
    'PID': str
    'PL': array<int32>
    'PS': int32
    'RGQ': int32
    'SB': array<int32>
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
In [20]:
q = hl.plot.scatter(geno.REX_AF,
                    geno.info.gnomAD_NFE_AF, xlabel='REX AF', ylabel='gnomAD NFE AF',
                    size=4)
show(q)
2020-05-25 01:46:14 Hail: INFO: Coerced sorted dataset
2020-05-25 01:47:23 Hail: INFO: Coerced sorted dataset
2020-05-25 01:48:32 Hail: INFO: Coerced sorted dataset
2020-05-25 01:48:32 Hail: INFO: Coerced sorted dataset
2020-05-25 01:48:32 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-05-25 01:48:47 Hail: INFO: Coerced sorted dataset
In [21]:
r = hl.plot.scatter(geno.REX_AF,
                    geno.info.gnomAD_AFR_AF, xlabel='REX AF', ylabel='gnomAD AFR AF',
                    size=4)
show(r)
2020-05-25 01:49:44 Hail: INFO: Coerced sorted dataset
2020-05-25 01:50:54 Hail: INFO: Coerced sorted dataset
2020-05-25 01:52:04 Hail: INFO: Coerced sorted dataset
2020-05-25 01:52:04 Hail: INFO: Coerced sorted dataset
2020-05-25 01:52:04 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-05-25 01:52:18 Hail: INFO: Coerced sorted dataset
In [22]:
s = hl.plot.scatter(geno.REX_AF,
                    geno.info.gnomAD_AMR_AF, xlabel='REX AF', ylabel='gnomAD AMR AF',
                    size=4)
show(s)
2020-05-25 01:53:15 Hail: INFO: Coerced sorted dataset
2020-05-25 01:54:25 Hail: INFO: Coerced sorted dataset
2020-05-25 01:55:35 Hail: INFO: Coerced sorted dataset
2020-05-25 01:55:35 Hail: INFO: Coerced sorted dataset
2020-05-25 01:55:35 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-05-25 01:55:50 Hail: INFO: Coerced sorted dataset
In [23]:
t = hl.plot.scatter(geno.REX_AF,
                    geno.info.gnomAD_EAS_AF, xlabel='REX AF', ylabel='gnomAD EAS AF',
                    size=4)
show(t)
2020-05-25 01:56:47 Hail: INFO: Coerced sorted dataset
2020-05-25 01:57:56 Hail: INFO: Coerced sorted dataset
2020-05-25 01:59:06 Hail: INFO: Coerced sorted dataset
2020-05-25 01:59:06 Hail: INFO: Coerced sorted dataset
2020-05-25 01:59:06 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-05-25 01:59:20 Hail: INFO: Coerced sorted dataset
In [24]:
u = hl.plot.scatter(geno.REX_AF,
                    geno.info.gnomAD_SAS_AF, xlabel='REX AF', ylabel='gnomAD SAS AF',
                    size=4)
show(u)
2020-05-25 02:00:17 Hail: INFO: Coerced sorted dataset
2020-05-25 02:01:26 Hail: INFO: Coerced sorted dataset
2020-05-25 02:02:36 Hail: INFO: Coerced sorted dataset
2020-05-25 02:02:36 Hail: INFO: Coerced sorted dataset
2020-05-25 02:02:36 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-05-25 02:02:50 Hail: INFO: Coerced sorted dataset
In [25]:
v = hl.plot.scatter(geno.REX_AF,
                    geno.info.gnomAD_FIN_AF, xlabel='REX AF', ylabel='gnomAD FIN AF',
                    size=4)
show(v)
2020-05-25 02:03:47 Hail: INFO: Coerced sorted dataset
2020-05-25 02:04:56 Hail: INFO: Coerced sorted dataset
2020-05-25 02:06:06 Hail: INFO: Coerced sorted dataset
2020-05-25 02:06:06 Hail: INFO: Coerced sorted dataset
2020-05-25 02:06:06 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-05-25 02:06:20 Hail: INFO: Coerced sorted dataset
In [26]:
w = hl.plot.scatter(geno.REX_AF,
                    geno.info.gnomAD_AF, xlabel='REX AF', ylabel='gnomAD global AF',
                    size=4)
show(w)
2020-05-25 02:07:17 Hail: INFO: Coerced sorted dataset
2020-05-25 02:08:28 Hail: INFO: Coerced sorted dataset
2020-05-25 02:09:37 Hail: INFO: Coerced sorted dataset
2020-05-25 02:09:38 Hail: INFO: Coerced sorted dataset
2020-05-25 02:09:38 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-05-25 02:09:51 Hail: INFO: Coerced sorted dataset
In [27]:
geno.info.AN = geno.variant_qc.AN
In [28]:
hl.export_vcf(geno, '/media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/geno_an.bgz')
2020-05-25 02:10:33 Hail: WARN: export_vcf: ignored the following fields:
    'sample_qc' (column)
    'variant_qc' (row)
    'REX_AF' (row)
2020-05-25 02:10:48 Hail: INFO: Coerced sorted dataset
2020-05-25 02:11:58 Hail: INFO: Coerced sorted dataset
2020-05-25 02:13:07 Hail: INFO: Coerced sorted dataset
2020-05-25 02:13:07 Hail: INFO: Coerced sorted dataset
2020-05-25 02:13:07 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-05-25 02:13:21 Hail: INFO: Coerced sorted dataset
2020-05-25 02:16:49 Hail: INFO: merging 1331 files totalling 3.2G...
2020-05-25 02:17:04 Hail: INFO: while writing:
    /media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/geno_an.bgz
  merge time: 15.150s
In [ ]:
 

overrepresentation analyses

reloading the vcf back after filtering, to extract p value

For overrepresentation analysis we need to do the following: 1) Select variants that are pathogenic or likely pathogenic (clinvar_significance is "p" or "l") 2) Calculate p-value based on binomial distribution. You'll need to annotate the rows of your matrixtable with a variable that you can call REX_AC or sth like it (it should be the total count of alternative allele at each position, like mt.variant_qc.AN - mt.variant_qc.AC[0]) The overrepresentation p-value should represent the binomial probability of sampling REX_AC allele in a sample of size mt.variant_qc.AN if the real allele frequency (success probability) was mt.info.gnomAD_AF

As far as I understand, you should be able to do a kind of vectorized operation with the mt if you do stuff in the annotat_rows

In [29]:
import scipy.stats as ss
import matplotlib.pyplot as plt
import re
import pandas as pd
from scipy.stats import binom
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels
import numpy as np
In [30]:
file = '/media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/manually_parsed_corr_3.selected.vcf'
In [31]:
with open(file) as f:
    #df = pd.DataFrame(columns=['gene', 'rsid', 'zygosity', 'clinvar', 'gnomad_AF', 'AC', 'AN', 'p_val'])
    df = []

    for line in f:
        
        if line.startswith('#'):
            continue
        else:
            clinvar_sig = re.findall('clinvar_significance=([^;]+?;)', line)[0].strip(';')
            if clinvar_sig in ['pathogenic', 'likely_pathogenic', 'pathogenic&likely_pathogenic']:
                g_AF = float(re.findall('gnomAD_NFE_AF=([^;]+)', line)[0]) # p_success
                s_AC = sum([int(x) for x in re.findall('AC=([^;]+)', line)[0].split(',')]) # n_success
                s_AN = int(re.findall('AN=([^;]+)', line)[0]) # sample_size

                zygosity = re.findall('\t(1\/1)', line)
                
                gene_n = re.findall('gene_name=([^;]+)', line)[0] # sample_size
                rs = re.findall('\t(rs\d*)\t', line)
                p_value = ss.binom.pmf(s_AC, s_AN, g_AF, loc=0)
                                                                  
                df.append((gene_n, rs, zygosity, clinvar_sig, g_AF, s_AC, s_AN, p_value))

                
In [32]:
data = pd.DataFrame(df)
data.columns = ['gene_name', 'rs', 'zygosity', 'clinvar_sig', 'g_AF', 's_AC', 's_AN', 'p_value']
In [33]:
# I am converting zygosity column as a string and taking first element
data.zygosity = data.zygosity.str[0]
In [34]:
# this line is to replace any non homozygous samples represented as 0, basically every condition that is
# not homozygous is represented as NaN
data["zygosity"] = data["zygosity"].fillna(0)
In [35]:
# now we can filter the condition 1/1 i.e. homozygous
data = data[(data[['zygosity']] == '1/1').all(axis=1)]

We should have the following filters AC should be greater than 0 gnomAD_AF should also be greater than 0 p-value, in turn, should be less than 0.01

In [36]:
data = data[(data[['p_value']] <= 0.01).all(axis=1)]
data = data[(data[['g_AF']] <= 0.005).all(axis=1)]
data = data[(data[['s_AC']] != 0).all(axis=1)]
data = data[(data[['g_AF']] != 0).all(axis=1)]
In [37]:
data.rs = data.rs.str[0]
In [38]:
# Sort the rows of dataframe by column 'g_AF'

data = data.sort_values(by ='s_AC', ascending=False)
In [39]:
data
Out[39]:
gene_name rs zygosity clinvar_sig g_AF s_AC s_AN p_value
643 PKD1 rs199476099 1/1 pathogenic 0.004165 35 1624 1.005320e-14
598 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15
537 PAH rs5030858 1/1 pathogenic 0.001477 24 1680 3.508603e-16
838 MAP3K15 rs2229137 1/1 pathogenic 0.000415 18 1680 1.078902e-19
845 AR rs137852591 1/1 pathogenic 0.002374 16 1674 3.345608e-06
409 FOXE1 rs538912281 1/1 pathogenic 0.003182 12 1536 2.888287e-03
859 SOX3 rs200361128 1/1 likely_pathogenic 0.001398 9 1674 5.528962e-04
837 ARSE rs201424543 1/1 likely_pathogenic 0.000654 6 1674 7.982879e-04
854 COL4A5 rs104886192 1/1 pathogenic 0.000186 6 1674 9.084699e-07
422 RPL7A rs782316919 1/1 pathogenic 0.000150 5 1680 6.568766e-06
519 LRRK2 rs34637584 1/1 pathogenic 0.000255 4 1678 9.145168e-04
456 SLC22A18 rs78838117 1/1 pathogenic 0.000075 3 1460 1.956913e-04
844 AR rs137852593 1/1 pathogenic 0.000270 3 1672 9.771392e-03
401 GALT rs111033695 1/1 pathogenic 0.000044 2 1680 2.530572e-03
501 PUS3 rs774005569 1/1 pathogenic 0.000026 2 1078 3.923755e-04
493 ATM rs780905851 1/1 likely_pathogenic 0.000009 2 1680 1.079089e-04
177 UGT1A6 rs34993780 1/1 pathogenic 0.000026 2 1676 9.387703e-04
In [40]:
# loading omim tables

omim = pd.read_csv('/home/mrinalv/project/use_omim_table.txt', sep='\t')
In [41]:
omim.head()
Out[41]:
genes hgnc_synonyms hgnc_genes phenotype phenotypeInheritance geneMimNumber phenotypeMimNumber chromosome comments
0 CMM|MLM|DNS CMM|CDKN2A|CMM CMM,CDKN2A Melanoma, cutaneous malignant, 1 Autosomal dominant 155600 155600.0 1 some linkage studies negative; see 9p
1 CTRCT8|CCV CCV|CCV CCV Cataract 8, multiple types Autosomal dominant 115665 115665.0 1 linked to Rh in Scottish family
2 DYX8 DYX8 DYX8 Dyslexia, susceptibility to, 8 Autosomal dominant; Multifactorial 608995 608995.0 1 between D1S552 and D1S1622
3 IBD7 IBD7 IBD7 Inflammatory bowel disease 7 None 605225 605225.0 1 associated with rs6426833
4 MYP14 MYP14 MYP14 Myopia 14 None 610320 610320.0 1 between D1S552 and D1S1622
In [42]:
# explode a dataframe column

def explode(df, lst_cols, fill_value='', preserve_index=False):
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values    
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF
    res = (pd.DataFrame({
                col:np.repeat(df[col].values, lens)
                for col in idx_cols},
                index=idx)
             .assign(**{col:np.concatenate(df.loc[lens>0, col].values)
                            for col in lst_cols}))
    # append those rows that have empty lists
    if (lens == 0).any():
        # at least one list in cells is empty
        res = (res.append(df.loc[lens==0, idx_cols], sort=False)
                  .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:        
        res = res.reset_index(drop=True)
    return res
In [43]:
omim['genes'] = omim['genes'].astype(str)
omim = pd.DataFrame(explode(omim.assign(genes=omim.genes.str.split('|')), 'genes'))

omim = omim.rename(columns={'genes': 'gene_name'})
omim.head()
Out[43]:
chromosome comments geneMimNumber hgnc_genes hgnc_synonyms phenotype phenotypeInheritance phenotypeMimNumber gene_name
0 1 some linkage studies negative; see 9p 155600 CMM,CDKN2A CMM|CDKN2A|CMM Melanoma, cutaneous malignant, 1 Autosomal dominant 155600.0 CMM
1 1 some linkage studies negative; see 9p 155600 CMM,CDKN2A CMM|CDKN2A|CMM Melanoma, cutaneous malignant, 1 Autosomal dominant 155600.0 MLM
2 1 some linkage studies negative; see 9p 155600 CMM,CDKN2A CMM|CDKN2A|CMM Melanoma, cutaneous malignant, 1 Autosomal dominant 155600.0 DNS
3 1 linked to Rh in Scottish family 115665 CCV CCV|CCV Cataract 8, multiple types Autosomal dominant 115665.0 CTRCT8
4 1 linked to Rh in Scottish family 115665 CCV CCV|CCV Cataract 8, multiple types Autosomal dominant 115665.0 CCV
In [44]:
new_data = pd.merge(data, omim, on="gene_name")
new_data.head()
Out[44]:
gene_name rs zygosity clinvar_sig g_AF s_AC s_AN p_value chromosome comments geneMimNumber hgnc_genes hgnc_synonyms phenotype phenotypeInheritance phenotypeMimNumber
0 PKD1 rs199476099 1/1 pathogenic 0.004165 35 1624 1.005320e-14 16 NaN 601313 PKD1 PKD1 Polycystic kidney disease, adult type I Autosomal dominant 173900.0
1 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Albinism, brown oculocutaneous Autosomal recessive 203200.0
2 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Skin/hair/eye pigmentation 1, blue/nonblue eyes Autosomal recessive 227220.0
3 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Skin/hair/eye pigmentation 1, blond/brown hair Autosomal recessive 227220.0
4 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Albinism, oculocutaneous, type II Autosomal recessive 203200.0
In [45]:
new_data
Out[45]:
gene_name rs zygosity clinvar_sig g_AF s_AC s_AN p_value chromosome comments geneMimNumber hgnc_genes hgnc_synonyms phenotype phenotypeInheritance phenotypeMimNumber
0 PKD1 rs199476099 1/1 pathogenic 0.004165 35 1624 1.005320e-14 16 NaN 601313 PKD1 PKD1 Polycystic kidney disease, adult type I Autosomal dominant 173900.0
1 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Albinism, brown oculocutaneous Autosomal recessive 203200.0
2 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Skin/hair/eye pigmentation 1, blue/nonblue eyes Autosomal recessive 227220.0
3 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Skin/hair/eye pigmentation 1, blond/brown hair Autosomal recessive 227220.0
4 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Albinism, oculocutaneous, type II Autosomal recessive 203200.0
5 PAH rs5030858 1/1 pathogenic 0.001477 24 1680 3.508603e-16 12 close to IGF1 612349 PAH PAH|NA Hyperphenylalaninemia, non-PKU mild Autosomal recessive 261600.0
6 PAH rs5030858 1/1 pathogenic 0.001477 24 1680 3.508603e-16 12 close to IGF1 612349 PAH PAH|NA Phenylketonuria Autosomal recessive 261600.0
7 AR rs137852591 1/1 pathogenic 0.002374 16 1674 3.345608e-06 X NaN 313700 AKR1B1,AR AKR1B1,AR|AR|NA|AR|NA|AR|NA Androgen insensitivity X-linked recessive 300068.0
8 AR rs137852591 1/1 pathogenic 0.002374 16 1674 3.345608e-06 X NaN 313700 AKR1B1,AR AKR1B1,AR|AR|NA|AR|NA|AR|NA Prostate cancer, susceptibility to Autosomal dominant 176807.0
9 AR rs137852591 1/1 pathogenic 0.002374 16 1674 3.345608e-06 X NaN 313700 AKR1B1,AR AKR1B1,AR|AR|NA|AR|NA|AR|NA Spinal and bulbar muscular atrophy of Kennedy X-linked recessive 313200.0
10 AR rs137852591 1/1 pathogenic 0.002374 16 1674 3.345608e-06 X NaN 313700 AKR1B1,AR AKR1B1,AR|AR|NA|AR|NA|AR|NA Hypospadias 1, X-linked X-linked recessive 300633.0
11 AR rs137852591 1/1 pathogenic 0.002374 16 1674 3.345608e-06 X NaN 313700 AKR1B1,AR AKR1B1,AR|AR|NA|AR|NA|AR|NA Androgen insensitivity, partial, with or witho... X-linked recessive 312300.0
12 AR rs137852593 1/1 pathogenic 0.000270 3 1672 9.771392e-03 X NaN 313700 AKR1B1,AR AKR1B1,AR|AR|NA|AR|NA|AR|NA Androgen insensitivity X-linked recessive 300068.0
13 AR rs137852593 1/1 pathogenic 0.000270 3 1672 9.771392e-03 X NaN 313700 AKR1B1,AR AKR1B1,AR|AR|NA|AR|NA|AR|NA Prostate cancer, susceptibility to Autosomal dominant 176807.0
14 AR rs137852593 1/1 pathogenic 0.000270 3 1672 9.771392e-03 X NaN 313700 AKR1B1,AR AKR1B1,AR|AR|NA|AR|NA|AR|NA Spinal and bulbar muscular atrophy of Kennedy X-linked recessive 313200.0
15 AR rs137852593 1/1 pathogenic 0.000270 3 1672 9.771392e-03 X NaN 313700 AKR1B1,AR AKR1B1,AR|AR|NA|AR|NA|AR|NA Hypospadias 1, X-linked X-linked recessive 300633.0
16 AR rs137852593 1/1 pathogenic 0.000270 3 1672 9.771392e-03 X NaN 313700 AKR1B1,AR AKR1B1,AR|AR|NA|AR|NA|AR|NA Androgen insensitivity, partial, with or witho... X-linked recessive 312300.0
17 FOXE1 rs538912281 1/1 pathogenic 0.003182 12 1536 2.888287e-03 9 NaN 602617 TTF2,FOXE1 FOXE1|FOXE1|FOXE1|TTF2|NA Thyroid cancer, nonmedullary, 4 Autosomal dominant 616534.0
18 FOXE1 rs538912281 1/1 pathogenic 0.003182 12 1536 2.888287e-03 9 NaN 602617 TTF2,FOXE1 FOXE1|FOXE1|FOXE1|TTF2|NA Bamforth-Lazarus syndrome Autosomal recessive 241850.0
19 SOX3 rs200361128 1/1 likely_pathogenic 0.001398 9 1674 5.528962e-04 X P mutant in BFLS 313430 SOX3 SOX3|NA Panhypopituitarism, X-linked X-linked 312000.0
20 SOX3 rs200361128 1/1 likely_pathogenic 0.001398 9 1674 5.528962e-04 X P mutant in BFLS 313430 SOX3 SOX3|NA Mental retardation, X-linked, with isolated gr... None 300123.0
21 ARSE rs201424543 1/1 likely_pathogenic 0.000654 6 1674 7.982879e-04 X CDPX1 in contiguous gene syndrome with STS 300180 ARSE ARSE|ARSE|NA Chondrodysplasia punctata, X-linked recessive X-linked recessive 302950.0
22 COL4A5 rs104886192 1/1 pathogenic 0.000186 6 1674 9.084699e-07 X diffuse leiomyomatosis with Alport syndrome = ... 303630 COL4A5 COL4A5|COL4A5|COL4A5 Alport syndrome X-linked dominant 301050.0
23 LRRK2 rs34637584 1/1 pathogenic 0.000255 4 1678 9.145168e-04 12 NaN 609007 LRRK2 LRRK2|LRRK2 Parkinson disease 8 Autosomal dominant 607060.0
24 GALT rs111033695 1/1 pathogenic 0.000044 2 1680 2.530572e-03 9 NaN 606999 GALT GALT Galactosemia Autosomal recessive 230400.0
25 PUS3 rs774005569 1/1 pathogenic 0.000026 2 1078 3.923755e-04 11 mutation identified in 1 MRT55 family 616283 PUS3 PUS3|NA ?Mental retardation, autosomal recessive 55 Autosomal recessive 617051.0
26 ATM rs780905851 1/1 likely_pathogenic 0.000009 2 1680 1.079089e-04 11 NaN 607585 AGTR1,ATM,SLC33A1 ATM|ATM|AGTR1,SLC33A1 Lymphoma, mantle cell, somatic None NaN
27 ATM rs780905851 1/1 likely_pathogenic 0.000009 2 1680 1.079089e-04 11 NaN 607585 AGTR1,ATM,SLC33A1 ATM|ATM|AGTR1,SLC33A1 Lymphoma, B-cell non-Hodgkin, somatic None NaN
28 ATM rs780905851 1/1 likely_pathogenic 0.000009 2 1680 1.079089e-04 11 NaN 607585 AGTR1,ATM,SLC33A1 ATM|ATM|AGTR1,SLC33A1 Ataxia-telangiectasia Autosomal recessive 208900.0
29 ATM rs780905851 1/1 likely_pathogenic 0.000009 2 1680 1.079089e-04 11 NaN 607585 AGTR1,ATM,SLC33A1 ATM|ATM|AGTR1,SLC33A1 Breast cancer, susceptibility to Autosomal dominant 114480.0
30 ATM rs780905851 1/1 likely_pathogenic 0.000009 2 1680 1.079089e-04 11 NaN 607585 AGTR1,ATM,SLC33A1 ATM|ATM|AGTR1,SLC33A1 T-cell prolymphocytic leukemia, somatic None NaN
In [46]:
#new_data.to_csv(r'/media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/overrep.csv', index = False)
In [47]:
!head /media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/overrep.csv
gene_name,rs,zygosity,clinvar_sig,g_AF,s_AC,s_AN,p_value,chromosome,comments,geneMimNumber,hgnc_genes,hgnc_synonyms,phenotype,phenotypeInheritance,phenotypeMimNumber
PKD1,rs199476099,1/1,pathogenic,0.004165,35,1624,1.0053204970091002e-14,16,,601313,PKD1,PKD1,"Polycystic kidney disease, adult type I",Autosomal dominant,173900.0
OCA2,rs74653330,1/1,pathogenic,0.002681,29,1676,8.952818129144412e-15,15,?hypopigmentation in PWS and AS,611409,"HCL3,PEA15,MESDC2,OCA2",OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA,"Albinism, brown oculocutaneous",Autosomal recessive,203200.0
OCA2,rs74653330,1/1,pathogenic,0.002681,29,1676,8.952818129144412e-15,15,?hypopigmentation in PWS and AS,611409,"HCL3,PEA15,MESDC2,OCA2",OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA,"Skin/hair/eye pigmentation 1, blue/nonblue eyes",Autosomal recessive,227220.0
OCA2,rs74653330,1/1,pathogenic,0.002681,29,1676,8.952818129144412e-15,15,?hypopigmentation in PWS and AS,611409,"HCL3,PEA15,MESDC2,OCA2",OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA,"Skin/hair/eye pigmentation 1, blond/brown hair",Autosomal recessive,227220.0
OCA2,rs74653330,1/1,pathogenic,0.002681,29,1676,8.952818129144412e-15,15,?hypopigmentation in PWS and AS,611409,"HCL3,PEA15,MESDC2,OCA2",OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA,"Albinism, oculocutaneous, type II",Autosomal recessive,203200.0
PAH,rs5030858,1/1,pathogenic,0.001477,24,1680,3.508603169490813e-16,12,close to IGF1,612349,PAH,PAH|NA,"Hyperphenylalaninemia, non-PKU mild",Autosomal recessive,261600.0
PAH,rs5030858,1/1,pathogenic,0.001477,24,1680,3.508603169490813e-16,12,close to IGF1,612349,PAH,PAH|NA,Phenylketonuria,Autosomal recessive,261600.0
AR,rs137852591,1/1,pathogenic,0.002374,16,1674,3.345607627786532e-06,X,,313700,"AKR1B1,AR","AKR1B1,AR|AR|NA|AR|NA|AR|NA",Androgen insensitivity,X-linked recessive,300068.0
AR,rs137852591,1/1,pathogenic,0.002374,16,1674,3.345607627786532e-06,X,,313700,"AKR1B1,AR","AKR1B1,AR|AR|NA|AR|NA|AR|NA","Prostate cancer, susceptibility to",Autosomal dominant,176807.0
In [48]:
# now we need only Autosomal Recessive ones
In [49]:
auto_res = new_data[new_data['phenotypeInheritance'].str.contains("Autosomal recessive")]
#auto_res.to_csv(r'/media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/overrep_autores.csv', index = False)
In [50]:
!head /media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/overrep_autores.csv
gene_name,rs,zygosity,clinvar_sig,g_AF,s_AC,s_AN,p_value,chromosome,comments,geneMimNumber,hgnc_genes,hgnc_synonyms,phenotype,phenotypeInheritance,phenotypeMimNumber
OCA2,rs74653330,1/1,pathogenic,0.002681,29,1676,8.952818129144412e-15,15,?hypopigmentation in PWS and AS,611409,"HCL3,PEA15,MESDC2,OCA2",OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA,"Albinism, brown oculocutaneous",Autosomal recessive,203200.0
OCA2,rs74653330,1/1,pathogenic,0.002681,29,1676,8.952818129144412e-15,15,?hypopigmentation in PWS and AS,611409,"HCL3,PEA15,MESDC2,OCA2",OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA,"Skin/hair/eye pigmentation 1, blue/nonblue eyes",Autosomal recessive,227220.0
OCA2,rs74653330,1/1,pathogenic,0.002681,29,1676,8.952818129144412e-15,15,?hypopigmentation in PWS and AS,611409,"HCL3,PEA15,MESDC2,OCA2",OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA,"Skin/hair/eye pigmentation 1, blond/brown hair",Autosomal recessive,227220.0
OCA2,rs74653330,1/1,pathogenic,0.002681,29,1676,8.952818129144412e-15,15,?hypopigmentation in PWS and AS,611409,"HCL3,PEA15,MESDC2,OCA2",OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA,"Albinism, oculocutaneous, type II",Autosomal recessive,203200.0
PAH,rs5030858,1/1,pathogenic,0.001477,24,1680,3.508603169490813e-16,12,close to IGF1,612349,PAH,PAH|NA,"Hyperphenylalaninemia, non-PKU mild",Autosomal recessive,261600.0
PAH,rs5030858,1/1,pathogenic,0.001477,24,1680,3.508603169490813e-16,12,close to IGF1,612349,PAH,PAH|NA,Phenylketonuria,Autosomal recessive,261600.0
FOXE1,rs538912281,1/1,pathogenic,0.003182,12,1536,0.002888286502828189,9,,602617,"TTF2,FOXE1",FOXE1|FOXE1|FOXE1|TTF2|NA,Bamforth-Lazarus syndrome,Autosomal recessive,241850.0
GALT,rs111033695,1/1,pathogenic,4.395e-05,2,1680,0.00253057168476171,9,,606999,GALT,GALT,Galactosemia,Autosomal recessive,230400.0
PUS3,rs774005569,1/1,pathogenic,2.637e-05,2,1078,0.00039237546361056713,11,mutation identified in 1 MRT55 family,616283,PUS3,PUS3|NA,"?Mental retardation, autosomal recessive 55",Autosomal recessive,617051.0
In [51]:
auto_res
Out[51]:
gene_name rs zygosity clinvar_sig g_AF s_AC s_AN p_value chromosome comments geneMimNumber hgnc_genes hgnc_synonyms phenotype phenotypeInheritance phenotypeMimNumber
1 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Albinism, brown oculocutaneous Autosomal recessive 203200.0
2 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Skin/hair/eye pigmentation 1, blue/nonblue eyes Autosomal recessive 227220.0
3 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Skin/hair/eye pigmentation 1, blond/brown hair Autosomal recessive 227220.0
4 OCA2 rs74653330 1/1 pathogenic 0.002681 29 1676 8.952818e-15 15 ?hypopigmentation in PWS and AS 611409 HCL3,PEA15,MESDC2,OCA2 OCA2|OCA2|PEA15|OCA2|MESDC2|OCA2|HCL3|NA Albinism, oculocutaneous, type II Autosomal recessive 203200.0
5 PAH rs5030858 1/1 pathogenic 0.001477 24 1680 3.508603e-16 12 close to IGF1 612349 PAH PAH|NA Hyperphenylalaninemia, non-PKU mild Autosomal recessive 261600.0
6 PAH rs5030858 1/1 pathogenic 0.001477 24 1680 3.508603e-16 12 close to IGF1 612349 PAH PAH|NA Phenylketonuria Autosomal recessive 261600.0
18 FOXE1 rs538912281 1/1 pathogenic 0.003182 12 1536 2.888287e-03 9 NaN 602617 TTF2,FOXE1 FOXE1|FOXE1|FOXE1|TTF2|NA Bamforth-Lazarus syndrome Autosomal recessive 241850.0
24 GALT rs111033695 1/1 pathogenic 0.000044 2 1680 2.530572e-03 9 NaN 606999 GALT GALT Galactosemia Autosomal recessive 230400.0
25 PUS3 rs774005569 1/1 pathogenic 0.000026 2 1078 3.923755e-04 11 mutation identified in 1 MRT55 family 616283 PUS3 PUS3|NA ?Mental retardation, autosomal recessive 55 Autosomal recessive 617051.0
28 ATM rs780905851 1/1 likely_pathogenic 0.000009 2 1680 1.079089e-04 11 NaN 607585 AGTR1,ATM,SLC33A1 ATM|ATM|AGTR1,SLC33A1 Ataxia-telangiectasia Autosomal recessive 208900.0

Correlation plots

In [52]:
#dataframe for correlation plots

with open(file) as f: 
    #df = pd.DataFrame(columns = ['REX', 'g_AFR', 'g_AMR', 'g_EAS', 'g_SAS', 'g_NFE', 'g_FIN']
    n_df_all = []

    for line in f:

        if line.startswith('#'):
            continue
        else:
            clinvar_sig = re.findall('clinvar_significance=([^;]+?;)', line)[0].strip(';')
            REX = sum([float(x) for x in re.findall(';AF=([^;]+)', line)[0].split(',')])
            AFR = float(re.findall('gnomAD_AFR_AF=([^;]+)', line)[0])
            AMR = float(re.findall('gnomAD_AMR_AF=([^;]+)', line)[0]) 
            EAS = float(re.findall('gnomAD_EAS_AF=([^;]+)', line)[0]) 
            SAS = float(re.findall('gnomAD_SAS_AF=([^;]+)', line)[0]) 
            NFE = float(re.findall('gnomAD_NFE_AF=([^;]+)', line)[0]) 
            FIN = float(re.findall('gnomAD_FIN_AF=([^;]+)', line)[0])
            TOT = float(re.findall('gnomAD_AF=([^;]+)', line)[0])

            n_df_all.append((REX, AFR, AMR, EAS, SAS, NFE, FIN, TOT))

data_4 = pd.DataFrame(n_df_all)
data_4.columns = ['REX', 'g_AFR', 'g_AMR', 'g_EAS', 'g_SAS', 'g_NFE', 'g_FIN', 'TOT']
data_4 = data_4[(data_4[['TOT']] != 0).all(axis=1)]
In [53]:
def heatMap(df, mirror):

   # Create Correlation df
   corr = df.corr()
   # Plot figsize
   fig, ax = plt.subplots(figsize=(10, 10))
   # Generate Color Map
   colormap = sns.diverging_palette(220, 10, as_cmap=True)
   
   if mirror == True:
      #Generate Heat Map, allow annotations and place floats in map
      sns.heatmap(corr, cmap=colormap, annot=True, fmt=".2f")
      #Apply xticks
      plt.xticks(range(len(corr.columns)), corr.columns);
      #Apply yticks
      plt.yticks(range(len(corr.columns)), corr.columns)
      #show plot

   else:
      # Drop self-correlations
      dropSelf = np.zeros_like(corr)
      dropSelf[np.triu_indices_from(dropSelf)] = True
      # Generate Color Map
      colormap = sns.diverging_palette(220, 10, as_cmap=True)
      # Generate Heat Map, allow annotations and place floats in map
      sns.heatmap(corr, cmap=colormap, annot=True, fmt=".2f", mask=dropSelf)
      # Apply xticks
      plt.xticks(range(len(corr.columns)), corr.columns);
      # Apply yticks
      plt.yticks(range(len(corr.columns)), corr.columns)
   # show plot
   plt.show()
   
In [55]:
heatMap(data_4, mirror=False)
In [56]:
def corrdot(*args, **kwargs):
    corr_r = args[0].corr(args[1], 'pearson')
    corr_text = f"{corr_r:2.2f}".replace("0.", ".")
    ax = plt.gca()
    ax.set_axis_off()
    marker_size = abs(corr_r) * 10000
    ax.scatter([.5], [.5], marker_size, [corr_r], alpha=0.6, cmap="coolwarm",
               vmin=-1, vmax=1, transform=ax.transAxes)
    font_size = abs(corr_r) * 40 + 5
    ax.annotate(corr_text, [.5, .5,],  xycoords="axes fraction",
                ha='center', va='center', fontsize=font_size)

sns.set(style='white', font_scale=1.6)
iris = data_4.sample(10000)
g = sns.PairGrid(iris, aspect=1.4, diag_sharey=False)
g.map_lower(sns.regplot, lowess=True, ci=False, line_kws={'color': 'black'})
g.map_diag(sns.distplot, kde_kws={'color': 'black'})
g.map_upper(corrdot)
Out[56]:
<seaborn.axisgrid.PairGrid at 0x7f2c65f829d0>